# Setup environment ----
library(parallel)
library(data.table)
source("R/functions.R")
# Directory (note the trailing slash) that receives this script's output file;
# it is glued to the file name with paste0() when saving.
results_path <- "results/replication/"
# Restore the saved workspace image. It is expected to supply `house_data`,
# the table every step below starts from.
load(file = "data/data-2021-02-22.RData")
# Copy taken before `house_data` is filtered, so the lagged-outcome data set
# can later be built from the complete, unfiltered table.
house_data_lag <- copy(house_data)

#     House data ----
# Estimation sample: keep a row only when `unopposed` and `thirdother` are
# both 0 and none of the three variables below is missing.
# NOTE(review): completeness is checked on `real_dem_expenditure_advantage`,
# while the design matrices are built from the `_w_outside` variant -- confirm
# the two are missing for the same rows.
house_data <- house_data[
  unopposed == 0 &
    thirdother == 0 &
    !(is.na(real_dem_expenditure_advantage) |
      is.na(ideological_distance) |
      is.na(quality_challenger))]
# 5th and 95th percentiles of the spending-advantage measure in the filtered
# sample (missing values ignored). Element 1 is the lower cut point and
# element 2 the upper cut point used to separate the tails from the middle.
hou_tail_lims <- quantile(
  house_data[["real_dem_expenditure_advantage_w_outside"]],
  probs = c(.05, .95),
  na.rm = TRUE
)
# Design matrix for the House models. The spending advantage enters piecewise:
# `dem_spend_adv` carries its value only inside
# [hou_tail_lims[1], hou_tail_lims[2]] and is zero elsewhere, while each tail
# gets its own indicator (`*_tail`) and its own slope term (`*_tail_DSA`).
# Everything is computed inside local() so the helper vectors do not leak into
# the workspace.
hou_X <- local({
  dsa <- house_data[["real_dem_expenditure_advantage_w_outside"]]
  lower <- hou_tail_lims[1]
  upper <- hou_tail_lims[2]

  # 0/1 membership of each observation in the three spending regions.
  in_bottom <- as.numeric(dsa < lower)
  in_top <- as.numeric(dsa > upper)
  in_middle <- as.numeric(dsa >= lower & dsa <= upper)

  # Columns copied from `house_data` unchanged, in output order.
  passthrough <- c(
    "adj_dem_presvote_advantage", "ideological_distance",
    "dem_cfscore", "rep_cfscore",
    # house only
    "dem_inc_lq_chal", "dem_inc_hq_chal", "rep_inc_lq_chal", "rep_inc_hq_chal",
    "open_both_hi", "open_hi_dem_lo_rep", "open_lo_dem_hi_rep",
    # election-year indicators
    paste0("y", c(
      "80", "82", "84", "86", "88",
      "90", "92", "94", "96", "98",
      "00", "02", "04", "06", "08",
      "10", "12", "14", "16", "18"))
  )

  cbind(
    dem_spend_adv = dsa * in_middle,
    log_total_spending = log10(house_data[["real_total_spending_w_outside"]]),
    data.matrix(house_data[, passthrough, with = FALSE]),
    bottom_tail = in_bottom,
    top_tail = in_top,
    bottom_tail_DSA = dsa * in_bottom,
    top_tail_DSA = dsa * in_top,
    middle_90 = in_middle
  )
})
# Outcome as a one-column matrix whose column is named `dem_vote_share`.
hou_y <- cbind(dem_vote_share = house_data[["dem_vote_share"]])
#       data for randomForest ----
# Predictor matrix for the random-forest fit: the spending advantage enters
# untransformed (no tail split), and the contest-type indicators are derived
# from the codes 1-7 of `contest_type` rather than read from existing columns.
hou_X_rf <- local({
  contest <- house_data[["contest_type"]]
  covariates <- c(
    "adj_dem_presvote_advantage", "ideological_distance",
    "dem_cfscore", "rep_cfscore")
  year_cols <- paste0("y", c(
    "80", "82", "84", "86", "88",
    "90", "92", "94", "96", "98",
    "00", "02", "04", "06", "08",
    "10", "12", "14", "16", "18"))

  cbind(
    dem_spend_adv = house_data[["real_dem_expenditure_advantage_w_outside"]],
    log_total_spending = log10(house_data[["real_total_spending_w_outside"]]),
    data.matrix(house_data[, covariates, with = FALSE]),
    dem_inc_lq_chal = as.numeric(contest == 1),
    dem_inc_hq_chal = as.numeric(contest == 2),
    rep_inc_lq_chal = as.numeric(contest == 3),
    rep_inc_hq_chal = as.numeric(contest == 4),
    open_both_hi = as.numeric(contest == 5),
    open_hi_dem_lo_rep = as.numeric(contest == 6),
    open_lo_dem_hi_rep = as.numeric(contest == 7),
    data.matrix(house_data[, year_cols, with = FALSE])
  )
})
#       dichotomous outcome variable ----
# Binary outcome as an unnamed one-column matrix: 1 when `dem_vote_share`
# exceeds .5, 0 otherwise.
hou_y_01 <- matrix(
  as.numeric(house_data[["dem_vote_share"]] > .5),
  ncol = 1
)
#       data with lagged outcomes ----
# Attach each district's values from the election two years earlier, then
# apply the same sample restrictions as the main data plus a non-missing lag.
house_data_lag <- local({
  # Lookup table: rows are re-labelled with `year + 2`, so joining on
  # (year, district) pairs every race with the preceding election.
  previous <- house_data_lag[, .(
    year = year + 2,
    district,
    lag_outcome = dem_vote_share,
    lag_DSA = real_dem_expenditure_advantage_w_outside)]
  # Only rows with `redistricted == 0` receive a lag.
  current <- house_data_lag[redistricted == 0]
  lagged <- merge(
    current, previous,
    by = c("year", "district"),
    all.x = TRUE
  )
  # Election years excluded from the lagged analysis.
  # NOTE(review): presumably years without a usable previous election in the
  # same district -- confirm.
  dropped_years <- c(1980, 1982, 1992, 2002, 2012)
  lagged[
    !(year %in% dropped_years) &
      unopposed == 0 &
      thirdother == 0 &
      !(is.na(real_dem_expenditure_advantage) |
        is.na(ideological_distance) |
        is.na(quality_challenger) |
        is.na(contest_type) |
        is.na(lag_outcome))]
})
# Design matrix for the lagged-outcome models. It uses the same piecewise
# spending terms as `hou_X` (cut points still come from `hou_tail_lims`),
# derives the incumbent contest indicators from `contest_type` codes 1-4,
# keeps only the year indicators for elections left in `house_data_lag`,
# and appends `lag_outcome` as the last column.
# NOTE(review): the open-seat indicators present in `hou_X` are not included
# here -- confirm that is intentional.
hou_X_lag <- local({
  dsa <- house_data_lag[["real_dem_expenditure_advantage_w_outside"]]
  contest <- house_data_lag[["contest_type"]]
  lower <- hou_tail_lims[1]
  upper <- hou_tail_lims[2]

  # 0/1 membership of each observation in the three spending regions.
  in_bottom <- as.numeric(dsa < lower)
  in_top <- as.numeric(dsa > upper)
  in_middle <- as.numeric(dsa >= lower & dsa <= upper)

  covariates <- c(
    "adj_dem_presvote_advantage", "ideological_distance",
    "dem_cfscore", "rep_cfscore")
  year_cols <- paste0("y", c(
    "84", "86", "88", "90",
    "94", "96", "98", "00",
    "04", "06", "08", "10",
    "14", "16", "18"))

  cbind(
    dem_spend_adv = dsa * in_middle,
    log_total_spending =
      log10(house_data_lag[["real_total_spending_w_outside"]]),
    data.matrix(house_data_lag[, covariates, with = FALSE]),
    # house only
    dem_inc_lq_chal = as.numeric(contest == 1),
    dem_inc_hq_chal = as.numeric(contest == 2),
    rep_inc_lq_chal = as.numeric(contest == 3),
    rep_inc_hq_chal = as.numeric(contest == 4),
    data.matrix(house_data_lag[, year_cols, with = FALSE]),
    bottom_tail = in_bottom,
    top_tail = in_top,
    bottom_tail_DSA = dsa * in_bottom,
    top_tail_DSA = dsa * in_top,
    middle_90 = in_middle,
    lag_outcome = house_data_lag[["lag_outcome"]]
  )
})
# Outcome for the lagged sample, as a one-column matrix named `dem_vote_share`.
hou_y_lag <- cbind(dem_vote_share = house_data_lag[["dem_vote_share"]])

#       setup data for marginal effects ----
# Inputs for the marginal-effect calculations. `setstep()` and
# `make_plus_dx_dataset()` are not defined in this file (presumably they come
# from R/functions.R), so only their call pattern is described here.
# The variable of interest is the first column of `hou_X`, `dem_spend_adv`.
xH <- hou_X[, "dem_spend_adv"]
# Step size derived from that variable.
dxH <- setstep(xH)
# Two perturbed copies of the design matrix: one built with +dxH, the other
# with -dxH (same helper, opposite sign).
Xp1H <- make_plus_dx_dataset(hou_X, dxH, "hou")
Xm1H <- make_plus_dx_dataset(hou_X, -dxH, "hou")
#       for CV ----
# Fix the RNG state so the fold assignment is reproducible.
set.seed(seed = 1546625538)
# `hou_y` is a one-column matrix, so its row count is the number of
# observations. The second argument (5) is presumably the number of folds --
# confirm against make_train_and_test_sets().
hou_folds <- make_train_and_test_sets(nrow(hou_y), 5)

# save ----
# Write every object built above to a single .RData file under `results_path`.
# save() does not create missing directories and would fail only after all the
# preceding work has been done, so make sure the target directory exists first.
# dir.create() leaves an existing directory untouched (its warning is
# suppressed).
dir.create(results_path, recursive = TRUE, showWarnings = FALSE)
save(
  # tail cut points
  hou_tail_lims,
  # main sample, design matrix and outcome
  house_data, hou_X, hou_y,
  # random-forest predictors
  hou_X_rf,
  # dichotomous outcome
  hou_y_01,
  # lagged-outcome sample, design matrix and outcome
  house_data_lag, hou_X_lag, hou_y_lag,
  # marginal-effect inputs
  xH, dxH, Xp1H, Xm1H,
  # cross-validation folds
  hou_folds,
  file = paste0(results_path, "house_analysis_data.RData")
)

